
clear all
set more off

* Root folder of the project; every file path below is built from this global.
global dir /Volumes/Zihao_SSD2/PatentsView

*** This part cleans and extracts examiners' firstname and lastname
*** Zihao Li. 06/2024
* The path is quoted so the import keeps working if $dir ever contains blanks.
import delimited "$dir/rawdata/g_examiner_not_disambiguated.tsv", varnames(1) clear
keep patent_id raw_examiner_name_first raw_examiner_name_last examiner_role
rename (raw_examiner_name_first raw_examiner_name_last) (first_name last_name)
format %30s patent_id first_name last_name

* Remove leading and trailing blanks from both name parts.
replace first_name = strtrim(first_name)
replace last_name = strtrim(last_name)

* Drop rows without a first name. This runs after trimming so that a first
* name consisting only of blanks is treated as empty as well.
sort first_name
drop if first_name == ""

* Clean suffixes
* Generational suffixes are deleted from last_name by plain, case-sensitive
* substring removal. The patterns are applied strictly in the order listed:
* several later patterns are substrings of earlier ones, so the order matters.
foreach sfx in ", Jr." ", Jr," ", Jr" ". Jr." ". Jr" " Jr." {
	replace last_name = subinstr(last_name, "`sfx'", "", .)
}
* A bare " Jr" (no dot, no comma) is removed only when it ends the string.
replace last_name = substr(last_name, 1, length(last_name)-3) if substr(last_name, -3, .) == " Jr"
foreach sfx in ", JR." ", V" ", IV." ", IV" " IV" ", III." ", III" ",III" " III" ", II." ", II" ",II" " II" ", I" ", Sr." ", SR." {
	replace last_name = subinstr(last_name, "`sfx'", "", .)
}
* A digit zero followed by an apostrophe becomes a capital letter O.
replace last_name = subinstr(last_name, "0'", "O'", .)

* Full name = first name, one blank, suffix-cleaned last name.
gen examiner_name = first_name + " " + last_name

* Clean special characters (this is a comprehensive list)
* Each character below is deleted outright from examiner_name.
replace examiner_name = subinstr(examiner_name, "?", "", .)
replace examiner_name = subinstr(examiner_name, "!", "", .)
replace examiner_name = subinstr(examiner_name, "(", "", .)
replace examiner_name = subinstr(examiner_name, ")", "", .)
replace examiner_name = subinstr(examiner_name, "{", "", .)
replace examiner_name = subinstr(examiner_name, "}", "", .)
replace examiner_name = subinstr(examiner_name, "[", "", .)
replace examiner_name = subinstr(examiner_name, "]", "", .)
* A slash followed by a blank is removed together with that blank; any
* remaining slash is then removed on its own.
replace examiner_name = subinstr(examiner_name, "/ ", "", .)
replace examiner_name = subinstr(examiner_name, "/", "", .)
replace examiner_name = subinstr(examiner_name, "'", "", .)
* Compound quotes are needed to write a literal double-quote character.
replace examiner_name = subinstr(examiner_name, `"""', "", .)
replace examiner_name = subinstr(examiner_name, ";", "", .)
replace examiner_name = subinstr(examiner_name, "¸", "", .)
replace examiner_name = subinstr(examiner_name, "¥", "", .)
replace examiner_name = subinstr(examiner_name, "¶", "", .)
replace examiner_name = subinstr(examiner_name, "¨", "", .)
replace examiner_name = subinstr(examiner_name, "¤", "", .)
replace examiner_name = subinstr(examiner_name, "±", "", .)
replace examiner_name = subinstr(examiner_name, "&", "", .)
replace examiner_name = subinstr(examiner_name, "§a", "", .)
* NOTE(review): the statement below is disabled. As written it replaces EVERY
* capital "A" with "Anel" (e.g. "Adam" -> "Aneldam"), which corrupts all names
* containing an "A". The search string was presumably a garbled character
* sequence that has been lost; restore the intended pattern before re-enabling.
* replace examiner_name = subinstr(examiner_name, "A", "Anel", .)

* Transliterate accented and other non-ASCII letters in examiner_name to plain
* ASCII, one substring substitution at a time. The statements run top to
* bottom, so a pattern can only match what earlier statements left in place.

* --- a / A ---
replace examiner_name = subinstr(examiner_name, "å", "a", .)
replace examiner_name = subinstr(examiner_name, "á", "a", .)
replace examiner_name = subinstr(examiner_name, "à", "a", .)
replace examiner_name = subinstr(examiner_name, "ä", "a", .)
replace examiner_name = subinstr(examiner_name, "æ", "ae", .)
replace examiner_name = subinstr(examiner_name, "ã", "a", .)
replace examiner_name = subinstr(examiner_name, "â", "a", .)
replace examiner_name = subinstr(examiner_name, "a¹", "a", .)
replace examiner_name = subinstr(examiner_name, "Á", "A", .)
replace examiner_name = subinstr(examiner_name, "Å", "A", .)
* The two-character sequence is handled before the bare "Ä" on the next line.
replace examiner_name = subinstr(examiner_name, "Ä°", "A", .)
replace examiner_name = subinstr(examiner_name, "Ä", "A", .)
replace examiner_name = subinstr(examiner_name, "Ã", "A", .)
replace examiner_name = subinstr(examiner_name, "Ā", "A", .)
* NOTE(review): looks identical to the "Ã" statement two lines above; possibly
* a different encoding of the same glyph - confirm.
replace examiner_name = subinstr(examiner_name, "Ã", "A", .)
replace examiner_name = subinstr(examiner_name, "Â", "A", .)
* "Ã" has already been turned into "A" above, so the "Ã³" and "Ã¼" patterns
* can no longer match; the "A³" and "A¼" statements pick up what is left.
* NOTE(review): these sequences look like mis-decoded accented letters and are
* all collapsed to "A" - confirm that is the intended result.
replace examiner_name = subinstr(examiner_name, "Ã³", "A", .)
replace examiner_name = subinstr(examiner_name, "A³", "A", .)
replace examiner_name = subinstr(examiner_name, "Ã¼", "A", .)
replace examiner_name = subinstr(examiner_name, "A¼", "A", .)


* NOTE(review): "ß" is mapped to "b" here; the customary transliteration is
* "ss" - confirm this is intended.
replace examiner_name = subinstr(examiner_name, "ß", "b", .)

* --- c / C ---
replace examiner_name = subinstr(examiner_name, "č", "c", .)
replace examiner_name = subinstr(examiner_name, "ć", "c", .)
replace examiner_name = subinstr(examiner_name, "c̆", "c", .)
replace examiner_name = subinstr(examiner_name, "Ç", "C", .)
replace examiner_name = subinstr(examiner_name, "ç", "c", .)
replace examiner_name = subinstr(examiner_name, "¢", "c", .)
replace examiner_name = subinstr(examiner_name, "Č", "C", .)
replace examiner_name = subinstr(examiner_name, "©", "c", .)
replace examiner_name = subinstr(examiner_name, "Ç", "C", .)

* --- e / E ---
replace examiner_name = subinstr(examiner_name, "É", "E", .)
replace examiner_name = subinstr(examiner_name, "È", "E", .)
replace examiner_name = subinstr(examiner_name, "é", "e", .)
replace examiner_name = subinstr(examiner_name, "è", "e", .)
replace examiner_name = subinstr(examiner_name, "ě", "e", .)
replace examiner_name = subinstr(examiner_name, "ë", "e", .)
replace examiner_name = subinstr(examiner_name, "ȩ", "e", .)

* --- g ---
replace examiner_name = subinstr(examiner_name, "ǧ", "g", .)
replace examiner_name = subinstr(examiner_name, "ğ", "g", .)
replace examiner_name = subinstr(examiner_name, "ǧ̃", "g", .)
replace examiner_name = subinstr(examiner_name, "g̃", "g", .)
replace examiner_name = subinstr(examiner_name, "gˇ", "g", .)

* --- i / I ---
* Sequences made of a dotless "ı" plus a mark are replaced before the bare
* dotless "ı" itself is mapped to "i" further down.
replace examiner_name = subinstr(examiner_name, "İ", "I", .)
replace examiner_name = subinstr(examiner_name, "Î", "I", .)
replace examiner_name = subinstr(examiner_name, "Í", "I", .)
replace examiner_name = subinstr(examiner_name, "Ì", "I", .)
replace examiner_name = subinstr(examiner_name, "i̇", "i", .)
replace examiner_name = subinstr(examiner_name, "ı̈", "i", .)
replace examiner_name = subinstr(examiner_name, "ï", "i", .)
replace examiner_name = subinstr(examiner_name, "ı¨", "i", .)
replace examiner_name = subinstr(examiner_name, "ı́", "i", .)
replace examiner_name = subinstr(examiner_name, "í", "i", .)
replace examiner_name = subinstr(examiner_name, "ĭ", "i", .)
replace examiner_name = subinstr(examiner_name, "iˇ", "i", .)
replace examiner_name = subinstr(examiner_name, "iˆ", "i", .)
replace examiner_name = subinstr(examiner_name, "ı", "i", .)
replace examiner_name = subinstr(examiner_name, "î", "i", .)
replace examiner_name = subinstr(examiner_name, "¡", "i", .)

* --- l / L ---
replace examiner_name = subinstr(examiner_name, "Ł", "L", .)
replace examiner_name = subinstr(examiner_name, "Ľ", "L", .)
replace examiner_name = subinstr(examiner_name, "ł", "l", .)

* --- n / N ---
replace examiner_name = subinstr(examiner_name, "Ñ", "N", .)
replace examiner_name = subinstr(examiner_name, "ń", "n", .)
replace examiner_name = subinstr(examiner_name, "ñ", "n", .)

* --- o / O ---
replace examiner_name = subinstr(examiner_name, "Ø", "O", .)
replace examiner_name = subinstr(examiner_name, "Ó", "O", .)
replace examiner_name = subinstr(examiner_name, "Ö", "O", .)
replace examiner_name = subinstr(examiner_name, "Ò", "O", .)
replace examiner_name = subinstr(examiner_name, "Ō", "O", .)
replace examiner_name = subinstr(examiner_name, "ö", "o", .)
replace examiner_name = subinstr(examiner_name, "Ó", "O", .)
replace examiner_name = subinstr(examiner_name, "ó", "o", .)
replace examiner_name = subinstr(examiner_name, "ò", "o", .)
replace examiner_name = subinstr(examiner_name, "ø", "o", .)
replace examiner_name = subinstr(examiner_name, "ô", "o", .)
replace examiner_name = subinstr(examiner_name, "ő", "o", .)
replace examiner_name = subinstr(examiner_name, "œ", "oe", .)

* --- r ---
replace examiner_name = subinstr(examiner_name, "ř", "r", .)

* --- s / S ---
replace examiner_name = subinstr(examiner_name, "š", "s", .)
replace examiner_name = subinstr(examiner_name, "ş", "s", .)
replace examiner_name = subinstr(examiner_name, "Š", "S", .)
replace examiner_name = subinstr(examiner_name, "Ş", "S", .)
replace examiner_name = subinstr(examiner_name, "Ś", "S", .)

* --- u / U ---
replace examiner_name = subinstr(examiner_name, "ü", "u", .)
replace examiner_name = subinstr(examiner_name, "ú", "u", .)
replace examiner_name = subinstr(examiner_name, "ü", "u", .)
replace examiner_name = subinstr(examiner_name, "Ü", "U", .)
replace examiner_name = subinstr(examiner_name, "Ú", "U", .)

* --- y ---
replace examiner_name = subinstr(examiner_name, "ý", "y", .)

* --- z / Z ---
replace examiner_name = subinstr(examiner_name, "Ž", "Z", .)
replace examiner_name = subinstr(examiner_name, "ž", "z", .)


** Extract examiner FirstName
* Strip outer blanks, then break the name into blank-separated tokens
* (examiner_name1, examiner_name2, ...).
replace examiner_name = strtrim(examiner_name)
split examiner_name, parse(" ")
* split reports how many token variables it created and what they are called.
* Using that instead of a fixed count of 6 keeps the code working when the
* longest name has fewer or more than six tokens.
local n_tokens = r(nvars)
local token_vars `r(varlist)'

* Add spaces after dots if they aren't there already. Add dots to middle initials.
forval y = 1/`n_tokens' {
	replace examiner_name`y' = subinstr(examiner_name`y', ".", ". ", .)
	replace examiner_name`y' = strrtrim(examiner_name`y')
	* A token that is exactly one capital letter is treated as an initial.
	replace examiner_name`y' = examiner_name`y' + "." if ustrregexm(examiner_name`y', "[A-Z]$") & ustrlen(examiner_name`y') == 1
}

* Reshape into a full_name format
* Rebuild examiner_name from the cleaned tokens, joined by single blanks.
sort examiner_name
replace examiner_name = examiner_name1
forval y = 2/`n_tokens' {
	replace examiner_name = examiner_name + " " + examiner_name`y' if !missing(examiner_name`y')
}
drop `token_vars'

* Title-case the name, then remove a stray leading ". " if one is present.
replace examiner_name = proper(examiner_name)
replace examiner_name = substr(examiner_name, 3, .) if substr(examiner_name, 1, 2) == ". "

* Change x. xxx xxx into xxx x. xxx
* Tokenise the rebuilt name again. The token count is taken from split itself:
* a token such as "J.R." was expanded to "J. R." earlier, so this split can
* yield more tokens than the first one, and a fixed count would drop the extra
* tokens from the name and leave their variables behind in the dataset.
split examiner_name, parse(" ")
local n_parts = r(nvars)
local part_vars `r(varlist)'
gen examiner_namenew1 = examiner_name1

* When the name starts with an initial ("X.") and is followed by a real word
* (no dot, not one of the particles de/van/von/la) plus at least one further
* token, use that word as the first name and move the initial to slot 2.
* The swap needs a third token, so it is skipped when no name has three tokens.
if `n_parts' >= 3 {
	replace examiner_namenew1 = examiner_name2 if ustrregexm(examiner_name1, "^[A-Z]\.")==1 & ustrregexm(examiner_name2, "[A-Za-z]\.") == 0 & !inlist(lower(examiner_name2), "de", "van", "von", "la") & examiner_name3 != ""

	replace examiner_name2 = examiner_name1 if examiner_name1 != examiner_namenew1
}

* Rebuild the full name from the (possibly swapped) tokens.
gen examiner_namenew = examiner_namenew1
forval y = 2/`n_parts' {
	replace examiner_namenew = examiner_namenew + " " + examiner_name`y' if !missing(examiner_name`y')
}

order patent_id examiner_name examiner_namenew examiner_namenew1
drop examiner_name
rename examiner_namenew examiner_name

* Generate last name (last non-missing entry)
gen last_name_clean = ""
forval y = 1/`n_parts' {
	replace last_name_clean = examiner_name`y' if !missing(examiner_name`y')
}
* Generate first name
rename examiner_namenew1 first_name_clean
format %30s examiner_name

drop `part_vars'

order patent_id examiner_name first_name first_name_clean last_name last_name_clean
* Remove a single leading hyphen from each of the three cleaned name fields.
replace examiner_name = substr(examiner_name, 2, .) if substr(examiner_name, 1, 1) == "-"
replace first_name_clean = substr(first_name_clean, 2, .) if substr(first_name_clean, 1, 1) == "-"
replace last_name_clean = substr(last_name_clean, 2, .) if substr(last_name_clean, 1, 1) == "-"

* Individual fixes for last names
* For each surname below: when the cleaned last name is only two characters
* long and the raw last_name contains the surname, use the surname instead.
* The surnames are tried in this fixed order.
foreach surname in Gonzalez Dieu Krawczewicz Ngathi Tran Yu {
	replace last_name_clean = "`surname'" if strpos(last_name, "`surname'") & length(last_name_clean) == 2
}

* Name without middle tokens: cleaned first name plus cleaned last name.
gen examiner_name_nomid = first_name_clean + " " + last_name_clean
order examiner_name_nomid, after(examiner_name)
order examiner_role, after(examiner_name_nomid)
* Ascending patent_id; within a patent, examiner_role in descending order.
gsort patent_id -examiner_role

* Keep only primary examiner
* (implemented by removing the rows whose role is "assistant")
keep if examiner_role != "assistant"

* Export dataset
* The cleaned first name is lower-cased before the data are written out.
replace first_name_clean = lower(first_name_clean)
save $dir/temp/g_examiner_clean.dta, replace
export delimited using $dir/cleandata/g_examiner_clean.csv, replace

* Export firstname to genderize
* Work on a temporary copy: one sorted row per distinct cleaned first name.
preserve
keep first_name_clean
bysort first_name_clean: keep if _n == 1
export delimited using $dir/temp/g_examiner_clean_firstname.csv, replace
restore



*** ====================================================================================================
*** This part merges examiners' gender
global dir /Volumes/Zihao_SSD2/PatentsView

* Load the gendered first-name file; an empty gender is labelled "ambiguous".
* File paths are quoted so they keep working if $dir ever contains blanks.
import delimited "$dir/temp/g_examiner_firstname_gendered_r3.csv", clear
replace gender = "ambiguous" if gender == ""
rename (round count probability) (gender_round gender_count gender_prob)
drop name_cleaned name

* Attach the gender information to every examiner row with the same cleaned
* first name; only rows present in both files are kept.
merge 1:m first_name_clean using "$dir/temp/g_examiner_clean.dta", keep(match) nogenerate
format %20s first_name_clean last_name_clean first_name last_name
sort patent_id examiner_name

* Different thresholds for examiner gender
* Variable names are e_gender_<min probability>_<min count>; a name that fails
* a threshold is set to "ambiguous".
* Both sides of the probability comparison are rounded to float precision.
* If gender_prob was stored as a float, a stored 0.9 is slightly below the
* double literal 0.9 and would wrongly count as "< 0.9", while a stored 0.8 is
* slightly above the double literal 0.8; float() makes both cut-offs behave
* the same way.
* NOTE(review): a missing gender_prob or gender_count never satisfies "<", so
* such rows keep their gender value - confirm that is intended.
gen e_gender_09_0 = gender
replace e_gender_09_0 = "ambiguous" if float(gender_prob) < float(0.9)
gen e_gender_09_50 = gender
replace e_gender_09_50 = "ambiguous" if float(gender_prob) < float(0.9) | gender_count<50
gen e_gender_09_100 = gender
replace e_gender_09_100 = "ambiguous" if float(gender_prob) < float(0.9) | gender_count<100

gen e_gender_08_0 = gender
replace e_gender_08_0 = "ambiguous" if float(gender_prob) < float(0.8)
gen e_gender_08_50 = gender
replace e_gender_08_50 = "ambiguous" if float(gender_prob) < float(0.8) | gender_count<50
gen e_gender_08_100 = gender
replace e_gender_08_100 = "ambiguous" if float(gender_prob) < float(0.8) | gender_count<100

* Only the identifiers, the role and the six gender variables are kept.
drop examiner_name_nomid first_name_clean last_name_clean first_name last_name gender_round gender_count gender gender_prob

order patent_id examiner_name examiner_role e_gender_09_0 e_gender_09_50 e_gender_09_100
sort patent_id examiner_role

* The examiner always corresponds to the citing patent
rename patent_id patent_id_i

*** Export dataset
save "$dir/temp/g_examiner_gender_temp.dta", replace





